diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d4f0945b..7a21bf0c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Update `register_interface` to 0.4.3 - Updated Halide to version 15 - Move instruction cache into its own dependency +- Add Logk tree barrier and barrier with a fraction of linear and Log2 arrival +- Add registers to wakeup cores with stride and offset +- Add barrier with stride and offset ### Fixed - Fix type issue in `snitch_addr_demux` diff --git a/hardware/scripts/gen_benchmark_table.py b/hardware/scripts/gen_benchmark_table.py index 520595c12..12e1e90b0 100755 --- a/hardware/scripts/gen_benchmark_table.py +++ b/hardware/scripts/gen_benchmark_table.py @@ -18,20 +18,44 @@ def create_dataframe(directory: str): keys = ['cycles', + 'max_cycles', + 'min_cycles', + 'std_cycles', 'snitch_loads', 'snitch_stores', 'snitch_avg_load_latency', 'snitch_occupancy', 'total_ipc', - 'snitch_issues ', + 'snitch_issues', + 'max_snitch_issues', + 'min_snitch_issues', + 'std_snitch_issues', 'stall_tot', + 'max_stall_tot', + 'min_stall_tot', + 'std_stall_tot', 'stall_ins', + 'max_stall_ins', + 'min_stall_ins', + 'std_stall_ins', 'stall_raw', + 'max_stall_raw', + 'min_stall_raw', + 'std_stall_raw', 'stall_raw_lsu', 'stall_raw_acc', 'stall_lsu', + 'max_stall_lsu', + 'min_stall_lsu', + 'std_stall_lsu', 'stall_acc', + 'max_stall_acc', + 'min_stall_acc', + 'std_stall_acc', 'stall_wfi', + 'max_stall_wfi', + 'min_stall_wfi', + 'std_stall_wfi', 'seq_loads_local', 'seq_loads_global', 'itl_loads_local', @@ -48,22 +72,20 @@ def create_dataframe(directory: str): path = os.getcwd() df = pd.DataFrame(index=keys) for subdir in os.listdir(path): - filename = os.path.join(subdir, 'avg.txt') + filename = os.path.join(subdir, 'max.txt') filetext = open(filename).read() values = [] for key in keys: values.append( re.findall( - r'%s\s*[+-]?([0-9]*[.]?[0-9]+)' % + r'\b%s\b\s*[+-]?([0-9]*[.]?[0-9]+)' % (key), filetext)) df[subdir] = (np.asarray(values)).flatten() return df def main(): - script_path = pathlib.Path(__file__).parent.absolute() - # Parse arguments parser = argparse.ArgumentParser( description='Extract performance data from log files') @@ -89,7 +111,6 @@ def main(): action='store_true', help='Set verbose' ) - args = parser.parse_args() df = create_dataframe(args.input) df.to_excel(os.path.join(args.output, 'table.xls')) diff --git a/hardware/scripts/gen_max.py b/hardware/scripts/gen_max.py new file mode 100644 index 000000000..7c3f9131c --- /dev/null +++ b/hardware/scripts/gen_max.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +# Copyright 2022 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 + +# This script takes a set of .csv files in one of the results folders and +# generates the average and the max/min performance. +# Author: Marco Bertuletti + +import os +import pandas as pd +import numpy as np +import argparse +import sys + +ext = ('.csv') + +parser = argparse.ArgumentParser() +parser.add_argument( + '--folder', + '-f', + help='Name of the results folder with traces to be averaged.' 
+) +args = parser.parse_args() + +os.chdir(args.folder) +path = os.getcwd() +print(path) + +for subdir in os.listdir(path): + subdir_path = os.path.join(path, subdir) + os.chdir(subdir_path) + print(subdir_path) + for files in os.listdir(subdir_path): + if files.endswith(ext): + csvread = pd.read_csv(files) + orig_stdout = sys.stdout + f = open('max.txt', 'w') + sys.stdout = f + + print("\n") + print("*******************************") + print("** AVERAGE PERFORMANCE **") + print("*******************************") + print("") + + for section in set(csvread['section']): + print("Section %d:\n" % section) + sectionread = csvread.loc[csvread['section'] == section] + keys = csvread.columns + remove_keys = ['core', + 'section', + 'start', + 'end', + 'snitch_load_latency', + 'snitch_load_region', + 'snitch_load_tile', + 'snitch_store_region', + 'snitch_store_tile'] + keys = keys.drop(remove_keys, errors='raise') + for key in keys: + try: + column = sectionread[key].replace(np.nan, 0) + column = column.to_numpy() + avg = np.average(column) + if key in ['cycles', + 'snitch_issues', + 'stall_tot', + 'stall_ins', + 'stall_raw', + 'stall_lsu', + 'stall_acc', + 'stall_wfi']: + max_val = np.max(column) + min_val = np.min(column) + std_val = np.std(column) + except Exception: + # Key could not be averaged + continue + print("%-30s %4.4f" % (key, avg)) + if key in ['cycles', + 'snitch_issues', + 'stall_tot', + 'stall_ins', + 'stall_raw', + 'stall_lsu', + 'stall_acc', + 'stall_wfi']: + print("%-30s %4.4f" % (('max_' + key), max_val)) + print("%-30s %4.4f" % (('min_' + key), min_val)) + print("%-30s %4.4f" % (('std_' + key), std_val)) + sys.stdout = orig_stdout + f.close() + os.chdir(args.folder) diff --git a/hardware/src/ctrl_registers.sv b/hardware/src/ctrl_registers.sv index 189c76fc0..08fa82e86 100644 --- a/hardware/src/ctrl_registers.sv +++ b/hardware/src/ctrl_registers.sv @@ -49,21 +49,23 @@ module ctrl_registers // [3 :0 ]:eoc_reg (rw) // [7 :4 ]:wake_up_reg (rw) // [11:8 ]:wake_up_group_reg (rw) - // [15:12]:tcdm_start_adress_reg (ro) - // [19:16]:tcdm_end_address_reg (ro) - // [23:20]:nr_cores_address_reg (ro) - // [27:24]:ro_cache_enable (rw) - // [31:28]:ro_cache_flush (rw) - // [35:32]:ro_cache_start_0 (rw) - // [39:36]:ro_cache_end_0 (rw) - // [43:40]:ro_cache_start_1 (rw) - // [47:44]:ro_cache_end_1 (rw) - // [51:48]:ro_cache_start_2 (rw) - // [55:52]:ro_cache_end_2 (rw) - // [59:56]:ro_cache_start_3 (rw) - // [63:60]:ro_cache_end_3 (rw) - - // [95:64]:wake_up_tile[7:0] (rw) + // [15:12]:wake_up_stride_reg (rw) + // [19:16]:wake_up_offset_reg (rw) + // [23:20]:tcdm_start_adress_reg (ro) + // [27:24]:tcdm_end_address_reg (ro) + // [31:28]:nr_cores_address_reg (ro) + // [35:32]:ro_cache_enable (rw) + // [39:36]:ro_cache_flush (rw) + // [43:40]:ro_cache_start_0 (rw) + // [47:44]:ro_cache_end_0 (rw) + // [51:48]:ro_cache_start_1 (rw) + // [55:52]:ro_cache_end_1 (rw) + // [59:56]:ro_cache_start_2 (rw) + // [63:60]:ro_cache_end_2 (rw) + // [67:64]:ro_cache_start_3 (rw) + // [71:68]:ro_cache_end_3 (rw) + + // [103:72]:wake_up_tile[7:0] (rw) localparam logic [MAX_NumGroups*DataWidth-1:0] RegRstVal_TileWakeUp = '{MAX_NumGroups*DataWidth{1'b0}}; localparam logic [NumRegs-MAX_NumGroups-1:0][DataWidth-1:0] RegRstVal = '{ @@ -81,6 +83,8 @@ module ctrl_registers TCDMBaseAddr + TCDMSize, TCDMBaseAddr, {DataWidth{1'b0}}, + 32'b1, + {DataWidth{1'b0}}, {DataWidth{1'b0}}, {DataWidth{1'b0}} }; @@ -102,6 +106,8 @@ module ctrl_registers ReadOnlyReg, ReadWriteReg, ReadWriteReg, + ReadWriteReg, + ReadWriteReg, 
ReadWriteReg }; @@ -111,6 +117,8 @@ module ctrl_registers logic [DataWidth-1:0] eoc; logic [DataWidth-1:0] wake_up; logic [DataWidth-1:0] wake_up_group; + logic [DataWidth-1:0] wake_up_stride; + logic [DataWidth-1:0] wake_up_offset; logic [DataWidth-1:0] tcdm_start_address; logic [DataWidth-1:0] tcdm_end_address; logic [DataWidth-1:0] num_cores; @@ -125,6 +133,7 @@ module ctrl_registers logic [DataWidth-1:0] ro_cache_start_3; logic [DataWidth-1:0] ro_cache_end_3; logic [MAX_NumGroups*DataWidth-1:0] wake_up_tile; + logic [NumCores-1:0] wake_up_mask; logic [RegNumBytes-1:0] wr_active_d; logic [RegNumBytes-1:0] wr_active_q; @@ -152,7 +161,8 @@ module ctrl_registers ro_cache_end_1, ro_cache_start_1, ro_cache_end_0, ro_cache_start_0, ro_cache_flush, ro_cache_enable, - num_cores, tcdm_end_address, tcdm_start_address, wake_up_group, wake_up, eoc }) + num_cores, tcdm_end_address, tcdm_start_address, + wake_up_offset, wake_up_stride, wake_up_group, wake_up, eoc }) ); /*************** @@ -177,12 +187,20 @@ module ctrl_registers always_comb begin wake_up_o = '0; + wake_up_mask = '0; + + // create mask for wake_up with stride and offset + for(int i = wake_up_offset; i < NumCores; i = i + wake_up_stride) begin + wake_up_mask[i] = 1; + end + // converts 32 bit wake up to 256 bit if (wr_active_q[7:4]) begin if (wake_up < NumCores) begin wake_up_o = 1 << wake_up; end else if (wake_up == {DataWidth{1'b1}}) begin wake_up_o = {NumCores{1'b1}}; + wake_up_o = wake_up_o & wake_up_mask; end end // converts 32 bit group wake up mask to 256 bit core wake up mask @@ -191,19 +209,21 @@ module ctrl_registers for(int i = 0; i < NumGroups; i = i + 1) begin wake_up_o[NumCoresPerGroup * i +: NumCoresPerGroup] = {NumCoresPerGroup{wake_up_group[i]}}; end + wake_up_o = wake_up_o & wake_up_mask; end else if (wake_up_group == {DataWidth{1'b1}}) begin wake_up_o = {NumCores{1'b1}}; + wake_up_o = wake_up_o & wake_up_mask; end end - // converts 32 bit tile wake up mask to 256 bit core wake up mask for(int i_g = 0; i_g < NumGroups; i_g = i_g + 1) begin - if (wr_active_q[64 + 4 * i_g +: 4]) begin + if (wr_active_q[72 + 4 * i_g +: 4]) begin if (wake_up_tile[i_g * DataWidth +: DataWidth] <= {NumTilesPerGroup{1'b1}}) begin for (int i = 0; i < NumTilesPerGroup; i = i + 1) begin wake_up_o[NumCoresPerGroup * i_g + NumCoresPerTile * i +: NumCoresPerTile] = {NumCoresPerTile{wake_up_tile[i_g * DataWidth + i]}}; end + wake_up_o = wake_up_o & wake_up_mask; end end diff --git a/hardware/src/mempool_system.sv b/hardware/src/mempool_system.sv index 7bddbe17e..1339762d4 100644 --- a/hardware/src/mempool_system.sv +++ b/hardware/src/mempool_system.sv @@ -718,7 +718,7 @@ module mempool_system ); ctrl_registers #( - .NumRegs (16 + 8 ), + .NumRegs (18 + 8 ), .TCDMBaseAddr (TCDMBaseAddr ), .TCDMSize (TCDMSize ), .NumCores (NumCores ), diff --git a/software/data/data_barriers_test.h.tpl b/software/data/data_barriers_test.h.tpl new file mode 100644 index 000000000..5b033c43a --- /dev/null +++ b/software/data/data_barriers_test.h.tpl @@ -0,0 +1,18 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+\
+<% def array_to_cstr(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '{}, '.format(a)
+        i += 1
+        if i % 8 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+uint32_t core_delays[${num_cores}] = ${array_to_cstr(delays)};
diff --git a/software/data/data_barriers_test.py b/software/data/data_barriers_test.py
new file mode 100755
index 000000000..59b43f755
--- /dev/null
+++ b/software/data/data_barriers_test.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+# Author: Marco Bertuletti, ETH Zurich
+
+import numpy as np
+import argparse
+import pathlib
+from mako.template import Template
+
+########################
+# gen_data_header_file #
+########################
+
+
+def gen_data_header_file(
+        outdir: pathlib.Path,
+        tpl: pathlib.Path,
+        **kwargs):
+    file = outdir / f"data_{kwargs['name']}.h"
+    print(tpl, outdir, kwargs['name'])
+    template = Template(filename=str(tpl))
+    with file.open('w') as f:
+        f.write(template.render(**kwargs))
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Generate data for kernels')
+    parser.add_argument(
+        "-o",
+        "--outdir",
+        type=pathlib.Path,
+        default=pathlib.Path(__file__).parent.absolute(),
+        required=False,
+        help='Select out directory of generated data files'
+    )
+    parser.add_argument(
+        "-t",
+        "--tpl",
+        type=pathlib.Path,
+        required=False,
+        default=pathlib.Path(__file__).parent.absolute()
+        / "data_barriers_test.h.tpl",
+        help='Path to mako template'
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action='store_true',
+        help='Set verbose'
+    )
+
+    parser.add_argument(
+        "delay_distribution",
+        nargs='?',
+        type=str,
+        default='uniform')
+    parser.add_argument(
+        "-n",
+        "--num_cores",
+        type=int,
+        required=False,
+        default=1024,
+        help='Number of cores.')
+    parser.add_argument(
+        "-a",
+        type=float,
+        required=False,
+        default=0.5,
+        help='Weibull shape parameter a.')
+    parser.add_argument(
+        "-d",
+        type=int,
+        required=False,
+        default=1,
+        help='Weibull scale parameter D.')
+    parser.add_argument(
+        "-m",
+        "--max",
+        type=int,
+        required=False,
+        default=1024,
+        help='Max delay.')
+
+    args = parser.parse_args()
+    num_cores = args.num_cores
+
+    if args.delay_distribution == 'weibull':
+        # Weibull distribution
+        a = args.a
+        D = args.d
+        delays = D * np.random.weibull(a, size=num_cores)
+        delays = np.asarray(delays, dtype='int')
+    else:
+        # Uniform
+        max_delay = args.max
+        delays = np.random.uniform(low=0.0, high=max_delay, size=num_cores)
+        delays = np.asarray(delays, dtype='int')
+
+    kwargs = {
+        'name': 'barriers_test',
+        'delays': delays,
+        'num_cores': num_cores}
+    gen_data_header_file(args.outdir, args.tpl, **kwargs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/software/runtime/arch.ld.c b/software/runtime/arch.ld.c
index 8d003388b..6f9e8732a 100644
--- a/software/runtime/arch.ld.c
+++ b/software/runtime/arch.ld.c
@@ -35,28 +35,30 @@ SECTIONS {
   eoc_reg = 0x40000000;
   wake_up_reg = 0x40000004;
   wake_up_group_reg = 0x40000008;
-  tcdm_start_address_reg = 0x4000000C;
-  tcdm_end_address_reg = 0x40000010;
-  nr_cores_address_reg = 0x40000014;
-  ro_cache_enable = 0x40000018;
-  ro_cache_flush = 0x4000001C;
-  ro_cache_start_0 = 0x40000020;
-  ro_cache_end_0 = 0x40000024;
-  ro_cache_start_1 = 0x40000028;
-  ro_cache_end_1 = 0x4000002C;
-  ro_cache_start_2 = 0x40000030;
-  ro_cache_end_2 = 0x40000034;
-  ro_cache_start_3 = 
0x40000038; - ro_cache_end_3 = 0x4000003C; - - wake_up_tile_g0_reg = 0x40000040; - wake_up_tile_g1_reg = 0x40000044; - wake_up_tile_g2_reg = 0x40000048; - wake_up_tile_g3_reg = 0x4000004C; - wake_up_tile_g4_reg = 0x40000050; - wake_up_tile_g5_reg = 0x40000054; - wake_up_tile_g6_reg = 0x40000058; - wake_up_tile_g7_reg = 0x4000005C; + wake_up_stride_reg = 0x4000000C; + wake_up_offset_reg = 0x40000010; + tcdm_start_address_reg = 0x40000014; + tcdm_end_address_reg = 0x40000018; + nr_cores_address_reg = 0x4000001C; + ro_cache_enable = 0x40000020; + ro_cache_flush = 0x40000024; + ro_cache_start_0 = 0x40000028; + ro_cache_end_0 = 0x4000002C; + ro_cache_start_1 = 0x40000030; + ro_cache_end_1 = 0x40000034; + ro_cache_start_2 = 0x40000038; + ro_cache_end_2 = 0x4000003C; + ro_cache_start_3 = 0x40000040; + ro_cache_end_3 = 0x40000044; + + wake_up_tile_g0_reg = 0x40000048; + wake_up_tile_g1_reg = 0x4000004C; + wake_up_tile_g2_reg = 0x40000050; + wake_up_tile_g3_reg = 0x40000054; + wake_up_tile_g4_reg = 0x40000058; + wake_up_tile_g5_reg = 0x4000005C; + wake_up_tile_g6_reg = 0x40000060; + wake_up_tile_g7_reg = 0x40000064; fake_uart = 0xC0000000; } diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 4abdbd682..1d4df71e4 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -27,6 +27,9 @@ extern volatile uint32_t wake_up_tile_g5_reg; extern volatile uint32_t wake_up_tile_g6_reg; extern volatile uint32_t wake_up_tile_g7_reg; +extern volatile uint32_t wake_up_stride_reg; +extern volatile uint32_t wake_up_offset_reg; + typedef uint32_t mempool_id_t; typedef uint32_t mempool_timer_t; @@ -167,6 +170,13 @@ static inline void wake_up_tile(uint32_t group_id, uint32_t tile_mask) { } } +static inline void set_wake_up_stride(uint32_t stride) { + wake_up_stride_reg = stride; +} +static inline void set_wake_up_offset(uint32_t offset) { + wake_up_offset_reg = offset; +} + // Dump a value via CSR // This is only supported in simulation and an experimental feature. All writes // to unimplemented CSR registers will be dumped by Snitch. 
This can be diff --git a/software/runtime/synchronization.c b/software/runtime/synchronization.c index 88627b4d6..150fa1eed 100644 --- a/software/runtime/synchronization.c +++ b/software/runtime/synchronization.c @@ -10,6 +10,15 @@ #include "runtime.h" #include "synchronization.h" +#if NUM_CORES == (16) +#define LOG2_NUM_CORES (4) +#elif NUM_CORES == (256) +#define LOG2_NUM_CORES (8) +#elif NUM_CORES == (1024) +#define LOG2_NUM_CORES (10) + +#endif + uint32_t volatile barrier __attribute__((section(".l1"))); uint32_t volatile log_barrier[NUM_CORES * 4] __attribute__((aligned(NUM_CORES * 4), section(".l1"))); @@ -20,6 +29,8 @@ void mempool_barrier_init(uint32_t core_id) { if (core_id == 0) { // Initialize the barrier barrier = 0; + set_wake_up_stride(1U); + set_wake_up_offset(0U); wake_up_all(); mempool_wfi(); } else { @@ -33,6 +44,13 @@ void mempool_barrier_init(uint32_t core_id) { mempool_barrier(NUM_CORES); } +/* PLAIN BARRIER */ + +/** + @brief Central counter barrier + @param[in] num_cores Number of cores arriving at the barrier + @return none +*/ void mempool_barrier(uint32_t num_cores) { // Increment the barrier counter if ((num_cores - 1) == __atomic_fetch_add(&barrier, 1, __ATOMIC_RELAXED)) { @@ -45,12 +63,150 @@ void mempool_barrier(uint32_t num_cores) { mempool_wfi(); } +/** + @brief Central counter barrier with stride and offset + @param[in] barrier Pointer to the barrier variable (can be assigned + locally depending on the offset) + @param[in] num_cores Number of cores arriving at the barrier + @param[in] stride Stride between cores to wake up + @param[in] offset ID of the first core involved in the barrier + @return none +*/ +void mempool_strided_barrier(uint32_t *barrier, uint32_t num_cores, + uint32_t stride, uint32_t offset) { + + // Increment the barrier counter + if ((num_cores - 1) == __atomic_fetch_add(barrier, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(barrier, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + set_wake_up_stride(stride); + set_wake_up_offset(offset); + wake_up_all(); + set_wake_up_stride(1U); + set_wake_up_offset(0U); + } + mempool_wfi(); +} + +/* LOG BARRIER */ + +/** + @brief Log2 tree barrier + @param[in] step Step of the logarithmic tree (must be set to 2) + @param[in] core_id ID of the core arriving at the barrier + @return none +*/ void mempool_log_barrier(uint32_t step, uint32_t core_id) { + uint32_t idx = (step * (core_id / step)) * 4; + uint32_t next_step, previous_step; + uint32_t num_cores = mempool_get_core_count(); + previous_step = step >> 1; + if ((step - previous_step) == + __atomic_fetch_add(&log_barrier[idx + previous_step - 1], previous_step, + __ATOMIC_RELAXED)) { + next_step = step << 1; + __atomic_store_n(&log_barrier[idx + previous_step - 1], 0, + __ATOMIC_RELAXED); + if (num_cores == step) { + __sync_synchronize(); // Full memory barrier + wake_up_all(); + mempool_wfi(); + } else { + mempool_log_barrier(next_step, core_id); + } + } else + mempool_wfi(); +} + +/** + @brief Tree barrier with any radix. In each step a central counter + barrier is used. + @param[in] radix log2(barrier radix), e.g. radix 2 -> 1 + @param[in] core_id ID of the core arriving at the barrier + @return none +*/ +void mempool_anyradixlog_barrier(uint32_t radix, uint32_t core_id) { + uint32_t num_cores = mempool_get_core_count(); + uint32_t first_step = (LOG2_NUM_CORES % radix) == 0 + ? 
(1U << radix) + : 1U << (LOG2_NUM_CORES % radix); + uint32_t step = 0, previous_step = 0; + // At first step you take care of the remainder + uint32_t idx = (first_step * (core_id / first_step)) * 4; + if ((first_step - 1) == + __atomic_fetch_add(&log_barrier[idx], 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&log_barrier[idx], 0, __ATOMIC_RELAXED); + num_cores /= first_step; + previous_step = first_step; + step = (first_step << radix); + // Following steps proceed with the radix chosen + while (num_cores > 1U) { + idx = (step * (core_id / step)) * 4; + if ((step - previous_step) == + __atomic_fetch_add(&log_barrier[idx + previous_step - 1], + previous_step, __ATOMIC_RELAXED)) { + __atomic_store_n(&log_barrier[idx + previous_step - 1], 0, + __ATOMIC_RELAXED); + num_cores >>= radix; + previous_step = step; + step <<= radix; + } else { + break; + } + } + // Last core wakes-up everyone + if (num_cores == 1U) { + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + } + mempool_wfi(); +} + +/** + @brief Central counter barrier on a subset of cores + log2 tree + barrier + @param[in] step Number of cores in central counter barrier + @param[in] core_id ID of the core arriving at the barrier + @return none +*/ +void mempool_linlog_barrier(uint32_t step, uint32_t core_id) { uint32_t idx = (step * (core_id / step)) * 4; uint32_t next_step, previous_step; uint32_t num_cores = mempool_get_core_count(); + previous_step = step >> 1; + if ((step - 1) == __atomic_fetch_add(&log_barrier[idx + previous_step - 1], 1, + __ATOMIC_RELAXED)) { + next_step = step << 1; + __atomic_store_n(&log_barrier[idx + previous_step - 1], 0, + __ATOMIC_RELAXED); + if (num_cores == step) { + __sync_synchronize(); // Full memory barrier + wake_up_all(); + mempool_wfi(); + } else { + mempool_log_barrier(next_step, core_id); + } + } else + mempool_wfi(); +} + +/** + @brief Log2 tree barrier with stride and offset + @param[in] step Step of the logarithmic tree (must be set to 2) + @param[in] stride Stride between cores to wake up + @param[in] offset ID of the first core involved in the barrier + @return none +*/ +void mempool_strided_log_barrier(uint32_t step, uint32_t core_id, + uint32_t stride, uint32_t offset) { + + uint32_t idx = (step * (core_id / step)) * 4 + offset; + uint32_t next_step, previous_step; + uint32_t num_cores = mempool_get_core_count(); + previous_step = step >> 1; if ((step - previous_step) == __atomic_fetch_add(&log_barrier[idx + previous_step - 1], previous_step, @@ -60,7 +216,11 @@ void mempool_log_barrier(uint32_t step, uint32_t core_id) { __ATOMIC_RELAXED); if (num_cores == step) { __sync_synchronize(); // Full memory barrier + set_wake_up_stride(stride); + set_wake_up_offset(offset); wake_up_all(); + set_wake_up_stride(1U); + set_wake_up_offset(0U); mempool_wfi(); } else { mempool_log_barrier(next_step, core_id); @@ -69,6 +229,15 @@ void mempool_log_barrier(uint32_t step, uint32_t core_id) { mempool_wfi(); } +/* PARTIAL BARRIER */ + +/** + @brief Log2 tree barrier on a subset of cores + @param[in] step Step of the logarithmic tree (must be set to 2) + @param[in] core_id ID of the first core involved in the barrier + @param[in] num_cores_barrier Number of cores involved in the barrier + @return none +*/ void mempool_log_partial_barrier(uint32_t step, uint32_t core_id, uint32_t num_cores_barrier) { @@ -119,6 +288,14 @@ void mempool_log_partial_barrier(uint32_t step, uint32_t core_id, } } +/** + @brief Central counter barrier on a subset of cores + @param[in] core_id ID of the first core 
involved in the barrier + @param[in] core_init First core involved in the barrier + @param[in] num_sleeping_cores Number of cores involved in the barrier + @param[in] memloc Location of the barrier variable + @return none +*/ void mempool_partial_barrier(uint32_t volatile core_id, uint32_t volatile core_init, uint32_t volatile num_sleeping_cores, diff --git a/software/runtime/synchronization.h b/software/runtime/synchronization.h index 80b79428f..75f8ce55c 100644 --- a/software/runtime/synchronization.h +++ b/software/runtime/synchronization.h @@ -11,8 +11,10 @@ void mempool_barrier_init(uint32_t core_id); void mempool_barrier(uint32_t num_cores); void mempool_log_barrier(uint32_t step, uint32_t core_id); +void mempool_anyradixlog_barrier(uint32_t radix, uint32_t core_id); void mempool_log_partial_barrier(uint32_t step, uint32_t core_id, uint32_t num_cores_barrier); +void mempool_linlog_barrier(uint32_t step, uint32_t core_id); void mempool_partial_barrier(uint32_t volatile core_id, uint32_t volatile core_init, uint32_t volatile num_sleeping_cores, diff --git a/software/tests/baremetal/barriers_test/main.c b/software/tests/baremetal/barriers_test/main.c new file mode 100644 index 000000000..de3222240 --- /dev/null +++ b/software/tests/baremetal/barriers_test/main.c @@ -0,0 +1,64 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include +#include +#include + +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#include "data_barriers_test.h" +#define LIN_LOG_BARRIER_DELAY + +dump(id, 1); + +int main() { + + uint32_t volatile core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); + mempool_barrier(num_cores); + +#if defined(PLAIN_BARRIER) + mempool_start_benchmark(); + mempool_barrier(num_cores); + mempool_stop_benchmark(); + +#elif defined(LOG_BARRIER) + mempool_start_benchmark(); + mempool_log_barrier(2, core_id); + mempool_stop_benchmark(); + +#elif defined(PLAIN_BARRIER_DELAY) + uint32_t delay = core_delays[core_id]; + mempool_wait(delay); + mempool_start_benchmark(); + mempool_barrier(num_cores); + mempool_stop_benchmark(); + +#elif defined(LOG_BARRIER_DELAY) + uint32_t delay = core_delays[core_id]; + mempool_wait(delay); + mempool_start_benchmark(); + mempool_log_barrier(2, core_id); + mempool_stop_benchmark(); + +#elif defined(LIN_LOG_BARRIER_DELAY) + uint32_t delay = core_delays[core_id]; + mempool_wait(delay); + mempool_start_benchmark(); + mempool_linlog_barrier(4, core_id); + mempool_stop_benchmark(); + + dump_id(core_id); + +#endif + + return 0; +} diff --git a/software/tests/baremetal/test_stride_wu/main.c b/software/tests/baremetal/test_stride_wu/main.c new file mode 100644 index 000000000..43daa50de --- /dev/null +++ b/software/tests/baremetal/test_stride_wu/main.c @@ -0,0 +1,115 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +uint32_t volatile sleep1 __attribute__((section(".l1"))); +uint32_t volatile sleep2 __attribute__((section(".l1"))); + +int main() { + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + + mempool_barrier_init(core_id); + if (core_id == 0) { + sleep1 = 0; + sleep2 = 0; + } + mempool_barrier(num_cores); + + /* WAKE-UP ALL TEST */ + + if (core_id % 4 == 0) { + if (core_id == 0) { + set_wake_up_stride(4U); + set_wake_up_offset(0U); + } + if ((num_cores / 4 - 1) == + __atomic_fetch_add(&sleep1, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep1, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + mempool_wfi(); + if (core_id == 0) { + set_wake_up_stride(1U); + set_wake_up_offset(0U); + printf("Cores woken up with stride 4 over the whole cluster \n"); + } + } + // Stops the remaining cores + if ((num_cores - 1) == __atomic_fetch_add(&sleep2, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep2, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + mempool_wfi(); + + /* WAKE-UP GROUP TEST */ + + if (core_id < NUM_CORES_PER_GROUP) { + if (core_id % 2 == 0) { + if (core_id == 0) { + set_wake_up_stride(2U); + set_wake_up_offset(0U); + } + if ((NUM_CORES_PER_GROUP / 2 - 1) == + __atomic_fetch_add(&sleep1, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep1, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_group(0b0001); + } + mempool_wfi(); + if (core_id == 0) { + set_wake_up_stride(1U); + set_wake_up_offset(0U); + printf("Cores woken up with stride 2 over a group \n"); + } + } + } + // Stops the remaining cores + if ((num_cores - 1) == __atomic_fetch_add(&sleep2, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep2, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + mempool_wfi(); + + /* WAKE-UP TILE TEST */ + + if (core_id < NUM_CORES_PER_TILE) { + if (core_id % 2 == 0) { + if (core_id == 0) { + set_wake_up_stride(2U); + set_wake_up_offset(0U); + } + if ((NUM_CORES_PER_TILE / 2 - 1) == + __atomic_fetch_add(&sleep1, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep1, 0, __ATOMIC_RELAXED); + wake_up_tile(0, 1U); + } + mempool_wfi(); + } + if (core_id == 0) { + set_wake_up_stride(1U); + set_wake_up_offset(0U); + printf("Cores woken up with stride 2 over a tile \n"); + } + } + // Stops the remaining cores + if ((num_cores - 1) == __atomic_fetch_add(&sleep2, 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&sleep2, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + mempool_wfi(); + + return 0; +}
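
For reference, a minimal usage sketch of the strided barrier added in synchronization.c (not part of this patch): the even-numbered cores synchronize among themselves with mempool_strided_barrier, so the last arriver programs wake_up_stride_reg = 2 and wake_up_offset_reg = 0 before wake_up_all(), wakes only cores 0, 2, 4, ..., and then restores stride 1 / offset 0. Sleeping odd cores are not disturbed by the wake-up. The L1 counter name subset_barrier and the kernel itself are hypothetical and only illustrate how the API declared in runtime.h and synchronization.h is meant to be called; it assumes an even core count.

```c
// Hypothetical example, mirroring the style of the tests in this patch.

#include <stdint.h>

#include "runtime.h"
#include "synchronization.h"

// Barrier counter for the even-core subset, placed in L1 like the global one.
uint32_t volatile subset_barrier __attribute__((section(".l1")));

int main() {
  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();

  mempool_barrier_init(core_id);
  if (core_id == 0) {
    subset_barrier = 0;
  }
  mempool_barrier(num_cores);

  if (core_id % 2 == 0) {
    // ... work done only by the even cores ...
    // Synchronize the num_cores / 2 even cores: stride 2, offset 0, so the
    // wake-up mask built in ctrl_registers targets exactly this subset.
    mempool_strided_barrier((uint32_t *)&subset_barrier, num_cores / 2, 2, 0);
  }

  // Odd cores skip straight here; the full barrier reunites all cores with
  // the default stride 1 / offset 0 wake-up.
  mempool_barrier(num_cores);

  return 0;
}
```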