diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d4f0945b..7a21bf0c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Update `register_interface` to 0.4.3 - Updated Halide to version 15 - Move instruction cache into its own dependency +- Add Logk tree barrier and barrier with a fraction of linear and Log2 arrival +- Add registers to wakeup cores with stride and offset +- Add barrier with stride and offset ### Fixed - Fix type issue in `snitch_addr_demux` diff --git a/hardware/scripts/gen_benchmark_table.py b/hardware/scripts/gen_benchmark_table.py index 520595c12..12e1e90b0 100755 --- a/hardware/scripts/gen_benchmark_table.py +++ b/hardware/scripts/gen_benchmark_table.py @@ -18,20 +18,44 @@ def create_dataframe(directory: str): keys = ['cycles', + 'max_cycles', + 'min_cycles', + 'std_cycles', 'snitch_loads', 'snitch_stores', 'snitch_avg_load_latency', 'snitch_occupancy', 'total_ipc', - 'snitch_issues ', + 'snitch_issues', + 'max_snitch_issues', + 'min_snitch_issues', + 'std_snitch_issues', 'stall_tot', + 'max_stall_tot', + 'min_stall_tot', + 'std_stall_tot', 'stall_ins', + 'max_stall_ins', + 'min_stall_ins', + 'std_stall_ins', 'stall_raw', + 'max_stall_raw', + 'min_stall_raw', + 'std_stall_raw', 'stall_raw_lsu', 'stall_raw_acc', 'stall_lsu', + 'max_stall_lsu', + 'min_stall_lsu', + 'std_stall_lsu', 'stall_acc', + 'max_stall_acc', + 'min_stall_acc', + 'std_stall_acc', 'stall_wfi', + 'max_stall_wfi', + 'min_stall_wfi', + 'std_stall_wfi', 'seq_loads_local', 'seq_loads_global', 'itl_loads_local', @@ -48,22 +72,20 @@ def create_dataframe(directory: str): path = os.getcwd() df = pd.DataFrame(index=keys) for subdir in os.listdir(path): - filename = os.path.join(subdir, 'avg.txt') + filename = os.path.join(subdir, 'max.txt') filetext = open(filename).read() values = [] for key in keys: values.append( re.findall( - r'%s\s*[+-]?([0-9]*[.]?[0-9]+)' % + r'\b%s\b\s*[+-]?([0-9]*[.]?[0-9]+)' % (key), filetext)) df[subdir] = (np.asarray(values)).flatten() return df def main(): - script_path = pathlib.Path(__file__).parent.absolute() - # Parse arguments parser = argparse.ArgumentParser( description='Extract performance data from log files') @@ -89,7 +111,6 @@ def main(): action='store_true', help='Set verbose' ) - args = parser.parse_args() df = create_dataframe(args.input) df.to_excel(os.path.join(args.output, 'table.xls')) diff --git a/hardware/scripts/gen_max.py b/hardware/scripts/gen_max.py new file mode 100644 index 000000000..7c3f9131c --- /dev/null +++ b/hardware/scripts/gen_max.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +# Copyright 2022 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 + +# This script takes a set of .csv files in one of the results folders and +# generates the average and the max/min performance. +# Author: Marco Bertuletti + +import os +import pandas as pd +import numpy as np +import argparse +import sys + +ext = ('.csv') + +parser = argparse.ArgumentParser() +parser.add_argument( + '--folder', + '-f', + help='Name of the results folder with traces to be averaged.' 
+) +args = parser.parse_args() + +os.chdir(args.folder) +path = os.getcwd() +print(path) + +for subdir in os.listdir(path): + subdir_path = os.path.join(path, subdir) + os.chdir(subdir_path) + print(subdir_path) + for files in os.listdir(subdir_path): + if files.endswith(ext): + csvread = pd.read_csv(files) + orig_stdout = sys.stdout + f = open('max.txt', 'w') + sys.stdout = f + + print("\n") + print("*******************************") + print("** AVERAGE PERFORMANCE **") + print("*******************************") + print("") + + for section in set(csvread['section']): + print("Section %d:\n" % section) + sectionread = csvread.loc[csvread['section'] == section] + keys = csvread.columns + remove_keys = ['core', + 'section', + 'start', + 'end', + 'snitch_load_latency', + 'snitch_load_region', + 'snitch_load_tile', + 'snitch_store_region', + 'snitch_store_tile'] + keys = keys.drop(remove_keys, errors='raise') + for key in keys: + try: + column = sectionread[key].replace(np.nan, 0) + column = column.to_numpy() + avg = np.average(column) + if key in ['cycles', + 'snitch_issues', + 'stall_tot', + 'stall_ins', + 'stall_raw', + 'stall_lsu', + 'stall_acc', + 'stall_wfi']: + max_val = np.max(column) + min_val = np.min(column) + std_val = np.std(column) + except Exception: + # Key could not be averaged + continue + print("%-30s %4.4f" % (key, avg)) + if key in ['cycles', + 'snitch_issues', + 'stall_tot', + 'stall_ins', + 'stall_raw', + 'stall_lsu', + 'stall_acc', + 'stall_wfi']: + print("%-30s %4.4f" % (('max_' + key), max_val)) + print("%-30s %4.4f" % (('min_' + key), min_val)) + print("%-30s %4.4f" % (('std_' + key), std_val)) + sys.stdout = orig_stdout + f.close() + os.chdir(args.folder) diff --git a/hardware/src/ctrl_registers.sv b/hardware/src/ctrl_registers.sv index 189c76fc0..08fa82e86 100644 --- a/hardware/src/ctrl_registers.sv +++ b/hardware/src/ctrl_registers.sv @@ -49,21 +49,23 @@ module ctrl_registers // [3 :0 ]:eoc_reg (rw) // [7 :4 ]:wake_up_reg (rw) // [11:8 ]:wake_up_group_reg (rw) - // [15:12]:tcdm_start_adress_reg (ro) - // [19:16]:tcdm_end_address_reg (ro) - // [23:20]:nr_cores_address_reg (ro) - // [27:24]:ro_cache_enable (rw) - // [31:28]:ro_cache_flush (rw) - // [35:32]:ro_cache_start_0 (rw) - // [39:36]:ro_cache_end_0 (rw) - // [43:40]:ro_cache_start_1 (rw) - // [47:44]:ro_cache_end_1 (rw) - // [51:48]:ro_cache_start_2 (rw) - // [55:52]:ro_cache_end_2 (rw) - // [59:56]:ro_cache_start_3 (rw) - // [63:60]:ro_cache_end_3 (rw) - - // [95:64]:wake_up_tile[7:0] (rw) + // [15:12]:wake_up_stride_reg (rw) + // [19:16]:wake_up_offset_reg (rw) + // [23:20]:tcdm_start_adress_reg (ro) + // [27:24]:tcdm_end_address_reg (ro) + // [31:28]:nr_cores_address_reg (ro) + // [35:32]:ro_cache_enable (rw) + // [39:36]:ro_cache_flush (rw) + // [43:40]:ro_cache_start_0 (rw) + // [47:44]:ro_cache_end_0 (rw) + // [51:48]:ro_cache_start_1 (rw) + // [55:52]:ro_cache_end_1 (rw) + // [59:56]:ro_cache_start_2 (rw) + // [63:60]:ro_cache_end_2 (rw) + // [67:64]:ro_cache_start_3 (rw) + // [71:68]:ro_cache_end_3 (rw) + + // [103:72]:wake_up_tile[7:0] (rw) localparam logic [MAX_NumGroups*DataWidth-1:0] RegRstVal_TileWakeUp = '{MAX_NumGroups*DataWidth{1'b0}}; localparam logic [NumRegs-MAX_NumGroups-1:0][DataWidth-1:0] RegRstVal = '{ @@ -81,6 +83,8 @@ module ctrl_registers TCDMBaseAddr + TCDMSize, TCDMBaseAddr, {DataWidth{1'b0}}, + 32'b1, + {DataWidth{1'b0}}, {DataWidth{1'b0}}, {DataWidth{1'b0}} }; @@ -102,6 +106,8 @@ module ctrl_registers ReadOnlyReg, ReadWriteReg, ReadWriteReg, + ReadWriteReg, + ReadWriteReg, 
ReadWriteReg }; @@ -111,6 +117,8 @@ module ctrl_registers logic [DataWidth-1:0] eoc; logic [DataWidth-1:0] wake_up; logic [DataWidth-1:0] wake_up_group; + logic [DataWidth-1:0] wake_up_stride; + logic [DataWidth-1:0] wake_up_offset; logic [DataWidth-1:0] tcdm_start_address; logic [DataWidth-1:0] tcdm_end_address; logic [DataWidth-1:0] num_cores; @@ -125,6 +133,7 @@ module ctrl_registers logic [DataWidth-1:0] ro_cache_start_3; logic [DataWidth-1:0] ro_cache_end_3; logic [MAX_NumGroups*DataWidth-1:0] wake_up_tile; + logic [NumCores-1:0] wake_up_mask; logic [RegNumBytes-1:0] wr_active_d; logic [RegNumBytes-1:0] wr_active_q; @@ -152,7 +161,8 @@ module ctrl_registers ro_cache_end_1, ro_cache_start_1, ro_cache_end_0, ro_cache_start_0, ro_cache_flush, ro_cache_enable, - num_cores, tcdm_end_address, tcdm_start_address, wake_up_group, wake_up, eoc }) + num_cores, tcdm_end_address, tcdm_start_address, + wake_up_offset, wake_up_stride, wake_up_group, wake_up, eoc }) ); /*************** @@ -177,12 +187,20 @@ module ctrl_registers always_comb begin wake_up_o = '0; + wake_up_mask = '0; + + // create mask for wake_up with stride and offset + for(int i = wake_up_offset; i < NumCores; i = i + wake_up_stride) begin + wake_up_mask[i] = 1; + end + // converts 32 bit wake up to 256 bit if (wr_active_q[7:4]) begin if (wake_up < NumCores) begin wake_up_o = 1 << wake_up; end else if (wake_up == {DataWidth{1'b1}}) begin wake_up_o = {NumCores{1'b1}}; + wake_up_o = wake_up_o & wake_up_mask; end end // converts 32 bit group wake up mask to 256 bit core wake up mask @@ -191,19 +209,21 @@ module ctrl_registers for(int i = 0; i < NumGroups; i = i + 1) begin wake_up_o[NumCoresPerGroup * i +: NumCoresPerGroup] = {NumCoresPerGroup{wake_up_group[i]}}; end + wake_up_o = wake_up_o & wake_up_mask; end else if (wake_up_group == {DataWidth{1'b1}}) begin wake_up_o = {NumCores{1'b1}}; + wake_up_o = wake_up_o & wake_up_mask; end end - // converts 32 bit tile wake up mask to 256 bit core wake up mask for(int i_g = 0; i_g < NumGroups; i_g = i_g + 1) begin - if (wr_active_q[64 + 4 * i_g +: 4]) begin + if (wr_active_q[72 + 4 * i_g +: 4]) begin if (wake_up_tile[i_g * DataWidth +: DataWidth] <= {NumTilesPerGroup{1'b1}}) begin for (int i = 0; i < NumTilesPerGroup; i = i + 1) begin wake_up_o[NumCoresPerGroup * i_g + NumCoresPerTile * i +: NumCoresPerTile] = {NumCoresPerTile{wake_up_tile[i_g * DataWidth + i]}}; end + wake_up_o = wake_up_o & wake_up_mask; end end diff --git a/hardware/src/mempool_system.sv b/hardware/src/mempool_system.sv index 7bddbe17e..1339762d4 100644 --- a/hardware/src/mempool_system.sv +++ b/hardware/src/mempool_system.sv @@ -718,7 +718,7 @@ module mempool_system ); ctrl_registers #( - .NumRegs (16 + 8 ), + .NumRegs (18 + 8 ), .TCDMBaseAddr (TCDMBaseAddr ), .TCDMSize (TCDMSize ), .NumCores (NumCores ), diff --git a/software/data/data_barriers_test.h.tpl b/software/data/data_barriers_test.h.tpl new file mode 100644 index 000000000..5b033c43a --- /dev/null +++ b/software/data/data_barriers_test.h.tpl @@ -0,0 +1,18 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+\
+<% def array_to_cstr(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '{}, '.format(a)
+        i += 1
+        if i % 8 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+uint32_t core_delays[${num_cores}] = ${array_to_cstr(delays)};
diff --git a/software/data/data_barriers_test.py b/software/data/data_barriers_test.py
new file mode 100755
index 000000000..59b43f755
--- /dev/null
+++ b/software/data/data_barriers_test.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+# Author: Marco Bertuletti, ETH Zurich
+
+import numpy as np
+import argparse
+import pathlib
+from mako.template import Template
+
+########################
+# gen_data_header_file #
+########################
+
+
+def gen_data_header_file(
+        outdir: pathlib.Path,
+        tpl: pathlib.Path,
+        **kwargs):
+    file = outdir / f"data_{kwargs['name']}.h"
+    print(tpl, outdir, kwargs['name'])
+    template = Template(filename=str(tpl))
+    with file.open('w') as f:
+        f.write(template.render(**kwargs))
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Generate data for kernels')
+    parser.add_argument(
+        "-o",
+        "--outdir",
+        type=pathlib.Path,
+        default=pathlib.Path(__file__).parent.absolute(),
+        required=False,
+        help='Select out directory of generated data files'
+    )
+    parser.add_argument(
+        "-t",
+        "--tpl",
+        type=pathlib.Path,
+        required=False,
+        default=pathlib.Path(__file__).parent.absolute()
+        / "data_barriers_test.h.tpl",
+        help='Path to mako template'
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action='store_true',
+        help='Set verbose'
+    )
+
+    parser.add_argument(
+        "delay_distribution",
+        nargs='?',
+        type=str,
+        default='uniform')
+    parser.add_argument(
+        "-n",
+        "--num_cores",
+        type=int,
+        required=False,
+        default=1024,
+        help='Number of cores.')
+    parser.add_argument(
+        "-a",
+        type=float,
+        required=False,
+        default=0.5,
+        help='Weibull shape parameter a.')
+    parser.add_argument(
+        "-d",
+        type=int,
+        required=False,
+        default=1,
+        help='Weibull scale parameter D.')
+    parser.add_argument(
+        "-m",
+        "--max",
+        type=int,
+        required=False,
+        default=1024,
+        help='Max delay.')
+
+    args = parser.parse_args()
+    num_cores = args.num_cores
+
+    if args.delay_distribution == 'weibull':
+        # Weibull distribution
+        a = args.a
+        D = args.d
+        delays = D * np.random.weibull(a, size=num_cores)
+        delays = np.asarray(delays, dtype='int')
+    else:
+        # Uniform
+        max_delay = args.max
+        delays = np.random.uniform(low=0.0, high=max_delay, size=num_cores)
+        delays = np.asarray(delays, dtype='int')
+
+    kwargs = {
+        'name': 'barriers_test',
+        'delays': delays,
+        'num_cores': num_cores}
+    gen_data_header_file(args.outdir, args.tpl, **kwargs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/software/runtime/arch.ld.c b/software/runtime/arch.ld.c
index 8d003388b..6f9e8732a 100644
--- a/software/runtime/arch.ld.c
+++ b/software/runtime/arch.ld.c
@@ -35,28 +35,30 @@ SECTIONS {
   eoc_reg = 0x40000000;
   wake_up_reg = 0x40000004;
   wake_up_group_reg = 0x40000008;
-  tcdm_start_address_reg = 0x4000000C;
-  tcdm_end_address_reg = 0x40000010;
-  nr_cores_address_reg = 0x40000014;
-  ro_cache_enable = 0x40000018;
-  ro_cache_flush = 0x4000001C;
-  ro_cache_start_0 = 0x40000020;
-  ro_cache_end_0 = 0x40000024;
-  ro_cache_start_1 = 0x40000028;
-  ro_cache_end_1 = 0x4000002C;
-  ro_cache_start_2 = 0x40000030;
-  ro_cache_end_2 = 0x40000034;
-  ro_cache_start_3 = 
0x40000038; - ro_cache_end_3 = 0x4000003C; - - wake_up_tile_g0_reg = 0x40000040; - wake_up_tile_g1_reg = 0x40000044; - wake_up_tile_g2_reg = 0x40000048; - wake_up_tile_g3_reg = 0x4000004C; - wake_up_tile_g4_reg = 0x40000050; - wake_up_tile_g5_reg = 0x40000054; - wake_up_tile_g6_reg = 0x40000058; - wake_up_tile_g7_reg = 0x4000005C; + wake_up_stride_reg = 0x4000000C; + wake_up_offset_reg = 0x40000010; + tcdm_start_address_reg = 0x40000014; + tcdm_end_address_reg = 0x40000018; + nr_cores_address_reg = 0x4000001C; + ro_cache_enable = 0x40000020; + ro_cache_flush = 0x40000024; + ro_cache_start_0 = 0x40000028; + ro_cache_end_0 = 0x4000002C; + ro_cache_start_1 = 0x40000030; + ro_cache_end_1 = 0x40000034; + ro_cache_start_2 = 0x40000038; + ro_cache_end_2 = 0x4000003C; + ro_cache_start_3 = 0x40000040; + ro_cache_end_3 = 0x40000044; + + wake_up_tile_g0_reg = 0x40000048; + wake_up_tile_g1_reg = 0x4000004C; + wake_up_tile_g2_reg = 0x40000050; + wake_up_tile_g3_reg = 0x40000054; + wake_up_tile_g4_reg = 0x40000058; + wake_up_tile_g5_reg = 0x4000005C; + wake_up_tile_g6_reg = 0x40000060; + wake_up_tile_g7_reg = 0x40000064; fake_uart = 0xC0000000; } diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 4abdbd682..1d4df71e4 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -27,6 +27,9 @@ extern volatile uint32_t wake_up_tile_g5_reg; extern volatile uint32_t wake_up_tile_g6_reg; extern volatile uint32_t wake_up_tile_g7_reg; +extern volatile uint32_t wake_up_stride_reg; +extern volatile uint32_t wake_up_offset_reg; + typedef uint32_t mempool_id_t; typedef uint32_t mempool_timer_t; @@ -167,6 +170,13 @@ static inline void wake_up_tile(uint32_t group_id, uint32_t tile_mask) { } } +static inline void set_wake_up_stride(uint32_t stride) { + wake_up_stride_reg = stride; +} +static inline void set_wake_up_offset(uint32_t offset) { + wake_up_offset_reg = offset; +} + // Dump a value via CSR // This is only supported in simulation and an experimental feature. All writes // to unimplemented CSR registers will be dumped by Snitch. 
This can be diff --git a/software/runtime/synchronization.c b/software/runtime/synchronization.c index 88627b4d6..150fa1eed 100644 --- a/software/runtime/synchronization.c +++ b/software/runtime/synchronization.c @@ -10,6 +10,15 @@ #include "runtime.h" #include "synchronization.h" +#if NUM_CORES == (16) +#define LOG2_NUM_CORES (4) +#elif NUM_CORES == (256) +#define LOG2_NUM_CORES (8) +#elif NUM_CORES == (1024) +#define LOG2_NUM_CORES (10) + +#endif + uint32_t volatile barrier __attribute__((section(".l1"))); uint32_t volatile log_barrier[NUM_CORES * 4] __attribute__((aligned(NUM_CORES * 4), section(".l1"))); @@ -20,6 +29,8 @@ void mempool_barrier_init(uint32_t core_id) { if (core_id == 0) { // Initialize the barrier barrier = 0; + set_wake_up_stride(1U); + set_wake_up_offset(0U); wake_up_all(); mempool_wfi(); } else { @@ -33,6 +44,13 @@ void mempool_barrier_init(uint32_t core_id) { mempool_barrier(NUM_CORES); } +/* PLAIN BARRIER */ + +/** + @brief Central counter barrier + @param[in] num_cores Number of cores arriving at the barrier + @return none +*/ void mempool_barrier(uint32_t num_cores) { // Increment the barrier counter if ((num_cores - 1) == __atomic_fetch_add(&barrier, 1, __ATOMIC_RELAXED)) { @@ -45,12 +63,150 @@ void mempool_barrier(uint32_t num_cores) { mempool_wfi(); } +/** + @brief Central counter barrier with stride and offset + @param[in] barrier Pointer to the barrier variable (can be assigned + locally depending on the offset) + @param[in] num_cores Number of cores arriving at the barrier + @param[in] stride Stride between cores to wake up + @param[in] offset ID of the first core involved in the barrier + @return none +*/ +void mempool_strided_barrier(uint32_t *barrier, uint32_t num_cores, + uint32_t stride, uint32_t offset) { + + // Increment the barrier counter + if ((num_cores - 1) == __atomic_fetch_add(barrier, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(barrier, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + set_wake_up_stride(stride); + set_wake_up_offset(offset); + wake_up_all(); + set_wake_up_stride(1U); + set_wake_up_offset(0U); + } + mempool_wfi(); +} + +/* LOG BARRIER */ + +/** + @brief Log2 tree barrier + @param[in] step Step of the logarithmic tree (must be set to 2) + @param[in] core_id ID of the core arriving at the barrier + @return none +*/ void mempool_log_barrier(uint32_t step, uint32_t core_id) { + uint32_t idx = (step * (core_id / step)) * 4; + uint32_t next_step, previous_step; + uint32_t num_cores = mempool_get_core_count(); + previous_step = step >> 1; + if ((step - previous_step) == + __atomic_fetch_add(&log_barrier[idx + previous_step - 1], previous_step, + __ATOMIC_RELAXED)) { + next_step = step << 1; + __atomic_store_n(&log_barrier[idx + previous_step - 1], 0, + __ATOMIC_RELAXED); + if (num_cores == step) { + __sync_synchronize(); // Full memory barrier + wake_up_all(); + mempool_wfi(); + } else { + mempool_log_barrier(next_step, core_id); + } + } else + mempool_wfi(); +} + +/** + @brief Tree barrier with any radix. In each step a central counter + barrier is used. + @param[in] radix log2(barrier radix), e.g. radix 2 -> 1 + @param[in] core_id ID of the core arriving at the barrier + @return none +*/ +void mempool_anyradixlog_barrier(uint32_t radix, uint32_t core_id) { + uint32_t num_cores = mempool_get_core_count(); + uint32_t first_step = (LOG2_NUM_CORES % radix) == 0 + ? 
(1U << radix) + : 1U << (LOG2_NUM_CORES % radix); + uint32_t step = 0, previous_step = 0; + // At first step you take care of the remainder + uint32_t idx = (first_step * (core_id / first_step)) * 4; + if ((first_step - 1) == + __atomic_fetch_add(&log_barrier[idx], 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&log_barrier[idx], 0, __ATOMIC_RELAXED); + num_cores /= first_step; + previous_step = first_step; + step = (first_step << radix); + // Following steps proceed with the radix chosen + while (num_cores > 1U) { + idx = (step * (core_id / step)) * 4; + if ((step - previous_step) == + __atomic_fetch_add(&log_barrier[idx + previous_step - 1], + previous_step, __ATOMIC_RELAXED)) { + __atomic_store_n(&log_barrier[idx + previous_step - 1], 0, + __ATOMIC_RELAXED); + num_cores >>= radix; + previous_step = step; + step <<= radix; + } else { + break; + } + } + // Last core wakes-up everyone + if (num_cores == 1U) { + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + } + mempool_wfi(); +} + +/** + @brief Central counter barrier on a subset of cores + log2 tree + barrier + @param[in] step Number of cores in central counter barrier + @param[in] core_id ID of the core arriving at the barrier + @return none +*/ +void mempool_linlog_barrier(uint32_t step, uint32_t core_id) { uint32_t idx = (step * (core_id / step)) * 4; uint32_t next_step, previous_step; uint32_t num_cores = mempool_get_core_count(); + previous_step = step >> 1; + if ((step - 1) == __atomic_fetch_add(&log_barrier[idx + previous_step - 1], 1, + __ATOMIC_RELAXED)) { + next_step = step << 1; + __atomic_store_n(&log_barrier[idx + previous_step - 1], 0, + __ATOMIC_RELAXED); + if (num_cores == step) { + __sync_synchronize(); // Full memory barrier + wake_up_all(); + mempool_wfi(); + } else { + mempool_log_barrier(next_step, core_id); + } + } else + mempool_wfi(); +} + +/** + @brief Log2 tree barrier with stride and offset + @param[in] step Step of the logarithmic tree (must be set to 2) + @param[in] stride Stride between cores to wake up + @param[in] offset ID of the first core involved in the barrier + @return none +*/ +void mempool_strided_log_barrier(uint32_t step, uint32_t core_id, + uint32_t stride, uint32_t offset) { + + uint32_t idx = (step * (core_id / step)) * 4 + offset; + uint32_t next_step, previous_step; + uint32_t num_cores = mempool_get_core_count(); + previous_step = step >> 1; if ((step - previous_step) == __atomic_fetch_add(&log_barrier[idx + previous_step - 1], previous_step, @@ -60,7 +216,11 @@ void mempool_log_barrier(uint32_t step, uint32_t core_id) { __ATOMIC_RELAXED); if (num_cores == step) { __sync_synchronize(); // Full memory barrier + set_wake_up_stride(stride); + set_wake_up_offset(offset); wake_up_all(); + set_wake_up_stride(1U); + set_wake_up_offset(0U); mempool_wfi(); } else { mempool_log_barrier(next_step, core_id); @@ -69,6 +229,15 @@ void mempool_log_barrier(uint32_t step, uint32_t core_id) { mempool_wfi(); } +/* PARTIAL BARRIER */ + +/** + @brief Log2 tree barrier on a subset of cores + @param[in] step Step of the logarithmic tree (must be set to 2) + @param[in] core_id ID of the first core involved in the barrier + @param[in] num_cores_barrier Number of cores involved in the barrier + @return none +*/ void mempool_log_partial_barrier(uint32_t step, uint32_t core_id, uint32_t num_cores_barrier) { @@ -119,6 +288,14 @@ void mempool_log_partial_barrier(uint32_t step, uint32_t core_id, } } +/** + @brief Central counter barrier on a subset of cores + @param[in] core_id ID of the first core 
involved in the barrier + @param[in] core_init First core involved in the barrier + @param[in] num_sleeping_cores Number of cores involved in the barrier + @param[in] memloc Location of the barrier variable + @return none +*/ void mempool_partial_barrier(uint32_t volatile core_id, uint32_t volatile core_init, uint32_t volatile num_sleeping_cores, diff --git a/software/runtime/synchronization.h b/software/runtime/synchronization.h index 80b79428f..75f8ce55c 100644 --- a/software/runtime/synchronization.h +++ b/software/runtime/synchronization.h @@ -11,8 +11,10 @@ void mempool_barrier_init(uint32_t core_id); void mempool_barrier(uint32_t num_cores); void mempool_log_barrier(uint32_t step, uint32_t core_id); +void mempool_anyradixlog_barrier(uint32_t radix, uint32_t core_id); void mempool_log_partial_barrier(uint32_t step, uint32_t core_id, uint32_t num_cores_barrier); +void mempool_linlog_barrier(uint32_t step, uint32_t core_id); void mempool_partial_barrier(uint32_t volatile core_id, uint32_t volatile core_init, uint32_t volatile num_sleeping_cores, diff --git a/software/tests/baremetal/barriers_test/main.c b/software/tests/baremetal/barriers_test/main.c new file mode 100644 index 000000000..de3222240 --- /dev/null +++ b/software/tests/baremetal/barriers_test/main.c @@ -0,0 +1,64 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include +#include +#include + +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#include "data_barriers_test.h" +#define LIN_LOG_BARRIER_DELAY + +dump(id, 1); + +int main() { + + uint32_t volatile core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); + mempool_barrier(num_cores); + +#if defined(PLAIN_BARRIER) + mempool_start_benchmark(); + mempool_barrier(num_cores); + mempool_stop_benchmark(); + +#elif defined(LOG_BARRIER) + mempool_start_benchmark(); + mempool_log_barrier(2, core_id); + mempool_stop_benchmark(); + +#elif defined(PLAIN_BARRIER_DELAY) + uint32_t delay = core_delays[core_id]; + mempool_wait(delay); + mempool_start_benchmark(); + mempool_barrier(num_cores); + mempool_stop_benchmark(); + +#elif defined(LOG_BARRIER_DELAY) + uint32_t delay = core_delays[core_id]; + mempool_wait(delay); + mempool_start_benchmark(); + mempool_log_barrier(2, core_id); + mempool_stop_benchmark(); + +#elif defined(LIN_LOG_BARRIER_DELAY) + uint32_t delay = core_delays[core_id]; + mempool_wait(delay); + mempool_start_benchmark(); + mempool_linlog_barrier(4, core_id); + mempool_stop_benchmark(); + + dump_id(core_id); + +#endif + + return 0; +} diff --git a/software/tests/baremetal/test_stride_wu/main.c b/software/tests/baremetal/test_stride_wu/main.c new file mode 100644 index 000000000..43daa50de --- /dev/null +++ b/software/tests/baremetal/test_stride_wu/main.c @@ -0,0 +1,115 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +uint32_t volatile sleep1 __attribute__((section(".l1"))); +uint32_t volatile sleep2 __attribute__((section(".l1"))); + +int main() { + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + + mempool_barrier_init(core_id); + if (core_id == 0) { + sleep1 = 0; + sleep2 = 0; + } + mempool_barrier(num_cores); + + /* WAKE-UP ALL TEST */ + + if (core_id % 4 == 0) { + if (core_id == 0) { + set_wake_up_stride(4U); + set_wake_up_offset(0U); + } + if ((num_cores / 4 - 1) == + __atomic_fetch_add(&sleep1, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep1, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + mempool_wfi(); + if (core_id == 0) { + set_wake_up_stride(1U); + set_wake_up_offset(0U); + printf("Cores woken up with stride 4 over the whole cluster \n"); + } + } + // Stops the remaining cores + if ((num_cores - 1) == __atomic_fetch_add(&sleep2, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep2, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + mempool_wfi(); + + /* WAKE-UP GROUP TEST */ + + if (core_id < NUM_CORES_PER_GROUP) { + if (core_id % 2 == 0) { + if (core_id == 0) { + set_wake_up_stride(2U); + set_wake_up_offset(0U); + } + if ((NUM_CORES_PER_GROUP / 2 - 1) == + __atomic_fetch_add(&sleep1, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep1, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_group(0b0001); + } + mempool_wfi(); + if (core_id == 0) { + set_wake_up_stride(1U); + set_wake_up_offset(0U); + printf("Cores woken up with stride 2 over a group \n"); + } + } + } + // Stops the remaining cores + if ((num_cores - 1) == __atomic_fetch_add(&sleep2, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep2, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + mempool_wfi(); + + /* WAKE-UP TILE TEST */ + + if (core_id < NUM_CORES_PER_TILE) { + if (core_id % 2 == 0) { + if (core_id == 0) { + set_wake_up_stride(2U); + set_wake_up_offset(0U); + } + if ((NUM_CORES_PER_TILE / 2 - 1) == + __atomic_fetch_add(&sleep1, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep1, 0, __ATOMIC_RELAXED); + wake_up_tile(0, 1U); + } + mempool_wfi(); + } + if (core_id == 0) { + set_wake_up_stride(1U); + set_wake_up_offset(0U); + printf("Cores woken up with stride 2 over a tile \n"); + } + } + // Stops the remaining cores + if ((num_cores - 1) == __atomic_fetch_add(&sleep2, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep2, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + mempool_wfi(); + + return 0; +}
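
For reference, a minimal usage sketch of the strided barrier added in synchronization.c (not part of this patch): the even-numbered cores synchronize among themselves with mempool_strided_barrier, so the last arriver programs wake_up_stride_reg = 2 and wake_up_offset_reg = 0 before wake_up_all(), wakes only cores 0, 2, 4, ..., and then restores stride 1 / offset 0. Sleeping odd cores are not disturbed by the wake-up. The L1 counter name subset_barrier and the kernel itself are hypothetical and only illustrate how the API declared in runtime.h and synchronization.h is meant to be called; it assumes an even core count.

```c
// Hypothetical example, mirroring the style of the tests in this patch.

#include <stdint.h>

#include "runtime.h"
#include "synchronization.h"

// Barrier counter for the even-core subset, placed in L1 like the global one.
uint32_t volatile subset_barrier __attribute__((section(".l1")));

int main() {
  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();

  mempool_barrier_init(core_id);
  if (core_id == 0) {
    subset_barrier = 0;
  }
  mempool_barrier(num_cores);

  if (core_id % 2 == 0) {
    // ... work done only by the even cores ...
    // Synchronize the num_cores / 2 even cores: stride 2, offset 0, so the
    // wake-up mask built in ctrl_registers targets exactly this subset.
    mempool_strided_barrier((uint32_t *)&subset_barrier, num_cores / 2, 2, 0);
  }

  // Odd cores skip straight here; the full barrier reunites all cores with
  // the default stride 1 / offset 0 wake-up.
  mempool_barrier(num_cores);

  return 0;
}
```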